In [1]:
import os
import shutil
import subprocess
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from IPython.display import display_html
from IPython.core.display import HTML
from matplotlib.patches import PathPatch
from matplotlib.colors import Colormap

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.manifold import TSNE
from scipy.stats import probplot

# Running-environment flag: Kaggle kernels expose KAGGLE_KERNEL_RUN_TYPE.
ON_KAGGLE = os.getenv("KAGGLE_KERNEL_RUN_TYPE") is not None

# Directory for persisted models, created eagerly so later saves cannot fail.
MODELS_PATH = Path("models")
MODELS_PATH.mkdir(exist_ok=True)

# Notebook-wide colour scheme.
FONT_COLOR = "#0b1320"
BACKGROUND_COLOR = "#F6F5F5"
DF_CMAP: Colormap = sns.light_palette("#204254", as_cmap=True)  # type: ignore

# Matplotlib rc overrides, grouped by subsystem and merged into one mapping.
_AXES_RC = {
    "axes.labelcolor": FONT_COLOR,
    "axes.labelsize": 10,
    "axes.labelpad": 15,
    "axes.labelweight": "bold",
    "axes.titlesize": 14,
    "axes.titleweight": "bold",
    "axes.titlepad": 15,
    "axes.facecolor": BACKGROUND_COLOR,
}
_TICK_RC = {
    "xtick.labelsize": 10,
    "xtick.color": FONT_COLOR,
    "ytick.labelsize": 10,
    "ytick.color": FONT_COLOR,
}
_FIGURE_RC = {
    "figure.titlesize": 14,
    "figure.titleweight": "bold",
    "figure.facecolor": BACKGROUND_COLOR,
    "figure.edgecolor": BACKGROUND_COLOR,
    "figure.dpi": 72,  # Locally Seaborn uses 72, meanwhile Kaggle 96.
}
_TEXT_RC = {
    "font.size": 10,
    "font.family": "Serif",
    "text.color": FONT_COLOR,
}
MY_RC = {**_AXES_RC, **_TICK_RC, **_FIGURE_RC, **_TEXT_RC}

sns.set_theme(rc=MY_RC)


def download_from_kaggle(competition):
    """Download and extract a Kaggle competition dataset into ``data/``.

    Skips all work when ``data/<competition>.zip`` already exists, so the
    notebook can be rerun without re-downloading.

    Parameters
    ----------
    competition : str
        Kaggle competition slug, e.g. ``"titanic"``.

    Raises
    ------
    subprocess.CalledProcessError
        If the ``kaggle`` CLI exits with a non-zero status.
    """
    data_dir = Path("data")
    archive = data_dir / (competition + ".zip")

    if archive.is_file():
        return  # Already fetched on a previous run.

    # Create the target directory *before* downloading so a failed download
    # cannot leave us without a place to unpack into.
    data_dir.mkdir(parents=True, exist_ok=True)

    # Pass the command as an argument list (no fragile string splitting) and
    # fail loudly if the Kaggle CLI reports an error (check=True).
    subprocess.run(
        ["kaggle", "competitions", "download", "-c", competition], check=True
    )

    # The CLI drops the archive into the CWD; unpack it into data/, then keep
    # the zip there as a marker that this competition was already downloaded.
    shutil.unpack_archive(competition + ".zip", "data")
    shutil.move(competition + ".zip", "data")


def adjust_box_widths(g, factor, orient="v"):
    """Rescale the box extents of a seaborn boxplot figure by ``factor``.

    Each box's vertices are shrunk/grown around the box centre along one
    coordinate axis, and the matching median line is rescaled to follow.

    Parameters
    ----------
    g : matplotlib Figure
        Figure whose axes contain seaborn-drawn boxplot patches.
    factor : float
        Multiplicative factor applied to each box's half-extent.
    orient : {"v", "h"}
        Orientation flag selecting which vertex coordinate to rescale.
    """
    if orient not in ("v", "h"):
        raise ValueError("The `orient` should be 'v' or 'h'.")

    # Coordinate column to rescale (matches the original mapping).
    axis_idx = 0 if orient == "h" else 1

    for ax in g.axes:
        for artist in ax.get_children():
            if not isinstance(artist, PathPatch):
                continue

            # The vertex slice is a live view: writing into it mutates the
            # patch path in place. The final (closing) vertex is excluded.
            coords = artist.get_path().vertices[:-1]
            lo = np.min(coords[:, axis_idx])
            hi = np.max(coords[:, axis_idx])
            centre = 0.5 * (lo + hi)
            half_span = 0.5 * (hi - lo)

            new_lo = centre - factor * half_span
            new_hi = centre + factor * half_span
            coords[coords[:, axis_idx] == lo, axis_idx] = new_lo
            coords[coords[:, axis_idx] == hi, axis_idx] = new_hi

            # Rescale the median line that spans exactly [lo, hi].
            for line in ax.lines:
                data = line.get_ydata() if orient == "v" else line.get_xdata()
                if len(data) == 2 and np.all(data == [lo, hi]):
                    if orient == "v":
                        line.set_ydata([new_lo, new_hi])
                    else:
                        line.set_xdata([new_lo, new_hi])


# Inject CSS so inline-code spans in markdown cells render with a tinted,
# rounded background throughout the notebook.
HTML(
    """
<style>
code {
    background: rgba(42, 53, 125, 0.1) !important;
    border-radius: 4px !important;
}
</style>
"""
)
Out[1]:
In [2]:
# Load the Titanic splits, indexing by passenger id so predictions can later
# be aligned with the submission format.
read_kw = dict(index_col="PassengerId")
train = pd.read_csv("train.csv", **read_kw)
test = pd.read_csv("test.csv", **read_kw)
In [ ]:
 
In [3]:
# Structural overview: dtypes and per-column non-null counts.
train.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB
In [ ]:
 
In [4]:
# Split the columns by dtype so preprocessing can treat categorical and
# numerical features separately.
categorical = train.select_dtypes("object").columns
numerical = train.select_dtypes("number").columns

for label, columns in (("Categorical:", categorical), ("Numerical:", numerical)):
    print(label.ljust(15), f"{list(columns)}".ljust(60), len(columns))
Categorical:    ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']               5
Numerical:      ['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']      6
In [ ]:
 
In [5]:
# Per-column NaN counts, restricted to columns that actually have gaps.
train_nans = train.isna().sum()
test_nans = test.isna().sum()

print("Train Dataset NaNs:")
print(train_nans[train_nans > 0])
print()
print("Test Dataset NaNs:")
print(test_nans[test_nans > 0])
Train Dataset NaNs:
Age         177
Cabin       687
Embarked      2
dtype: int64

Test Dataset NaNs:
Age       86
Fare       1
Cabin    327
dtype: int64
In [ ]:
 
In [6]:
# Spread of the numerical columns, with tail percentiles included to expose
# outliers (e.g. Fare's extreme right tail).
percentiles = [0.01, 0.05, 0.25, 0.50, 0.75, 0.95, 0.99]

num_description = (
    train.describe(percentiles=percentiles)
    .drop("count")            # the row count adds nothing here
    .rename(index=str.title)  # "mean" -> "Mean", etc.
    .style.background_gradient(DF_CMAP)
)

num_description
Out[6]:
  Survived Pclass Age SibSp Parch Fare
Mean 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
Std 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
Min 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
1% 0.000000 1.000000 1.000000 0.000000 0.000000 0.000000
5% 0.000000 1.000000 4.000000 0.000000 0.000000 7.225000
25% 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
95% 1.000000 3.000000 56.000000 3.000000 2.000000 112.079150
99% 1.000000 3.000000 65.870000 5.000000 4.000000 249.006220
Max 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200
In [ ]:
 
In [7]:
sns.clustermap(
    train.corr(),
    linecolor=BACKGROUND_COLOR,
    linewidth=10,
    annot=True,
    cmap=DF_CMAP,
    tree_kws={"linewidths": 1.5, "color": "#141B4D"},
    annot_kws={"fontsize": 12},
    figsize=(11, 11),
)
plt.show()
In [ ]:
 
In [8]:
# Pairwise relationships between the raw features (target dropped): scatter
# above the diagonal, marginal histograms on it, KDE contours below.
pair_grid = sns.PairGrid(train.drop("Survived", axis=1), diag_sharey=False)

pair_grid.fig.set_facecolor(BACKGROUND_COLOR)
pair_grid.fig.set_size_inches(11.7, 11.7)

pair_grid.map_upper(sns.scatterplot, color="#394d5f", marker="+", s=10)
pair_grid.map_diag(sns.histplot, color="#204254", bins=20)
pair_grid.map_lower(sns.kdeplot, levels=10, color="#e8ba91")

plt.show()
In [ ]:
 
In [9]:
# One row per numerical feature: overall histogram, survived-vs-died KDEs,
# and cumulative distributions split by outcome.
survived_mask = train["Survived"] == 1
features = ["Pclass", "Age", "SibSp", "Parch", "Fare"]

fig, axes = plt.subplots(nrows=5, ncols=3, figsize=(11.7, 15), tight_layout=True)
kde_kw = dict(fill=True, linewidth=2, alpha=0.2)
hist_kw = dict(
    linewidth=2, alpha=0.75, element="step", fill=False, cumulative=True, stat="density"
)

for feature, (ax_all, ax_kde, ax_cum) in zip(features, axes):
    survived = train.loc[survived_mask, feature]
    non_survived = train.loc[~survived_mask, feature]

    # Column 1: distribution over all passengers.
    sns.histplot(
        x=train[feature], color="#204254", stat="density", label="All",
        alpha=0.75, ax=ax_all,
    )
    ax_all.legend(loc="upper right")

    # Column 2: density split by survival outcome.
    sns.kdeplot(x=survived, color="#394d5f", label="1", ax=ax_kde, **kde_kw)
    sns.kdeplot(x=non_survived, color="#e8ba91", label="0", ax=ax_kde, **kde_kw)
    ax_kde.set_ylabel("")
    ax_kde.legend(loc="upper right", title="Survived")

    # Column 3: cumulative distributions split by survival outcome.
    sns.histplot(x=survived, color="#394d5f", label="1", ax=ax_cum, **hist_kw)
    sns.histplot(x=non_survived, color="#e8ba91", label="0", ax=ax_cum, **hist_kw)
    ax_cum.legend(loc="lower right", title="Survived")
    ax_cum.set_ylabel("")

plt.show()
In [ ]:
 
In [10]:
names = {
    "mean": "Mean Survival Rate",
    "sum": "Survivors",
    "count": "Group Size",
}


def survival_pivot(index):
    """Styled survival pivot (rate / survivors / group size) over `index`."""
    return (
        train.pivot_table(
            values="Survived",
            index=index,
            aggfunc=["mean", "sum", "count"],
            margins=True,
            margins_name="Total",
        )
        .rename(columns=names)
        .droplevel(level=1, axis="columns")
        .style.background_gradient(DF_CMAP)
        .set_table_attributes("style='display:inline'")
    )


# Class x age-quintile breakdown next to the plain per-class breakdown.
df1 = survival_pivot(("Pclass", pd.qcut(train["Age"], 5)))
df2 = survival_pivot("Pclass")

# Render the two styled tables side by side.
display_html(df1._repr_html_() + df2._repr_html_(), raw=True)
    Mean Survival Rate Survivors Group Size
Pclass Age      
1 (0.419, 19.0] 0.809524 17 21
(19.0, 25.0] 0.761905 16 21
(25.0, 31.8] 0.666667 16 24
(31.8, 41.0] 0.777778 35 45
(41.0, 80.0] 0.506667 38 75
2 (0.419, 19.0] 0.742857 26 35
(19.0, 25.0] 0.400000 12 30
(25.0, 31.8] 0.416667 15 36
(31.8, 41.0] 0.461538 18 39
(41.0, 80.0] 0.363636 12 33
3 (0.419, 19.0] 0.333333 36 108
(19.0, 25.0] 0.197674 17 86
(25.0, 31.8] 0.283582 19 67
(31.8, 41.0] 0.166667 10 60
(41.0, 80.0] 0.088235 3 34
Total 0.383838 342 891
  Mean Survival Rate Survivors Group Size
Pclass      
1 0.629630 136 216
2 0.472826 87 184
3 0.242363 119 491
Total 0.383838 342 891
In [ ]:
 
In [11]:
# Age and Fare per class, split by survival. Fares above 500 (a handful of
# extreme outliers) are clipped to the median to keep the axis legible.
df = train.copy()
df.loc[df["Fare"] > 500, "Fare"] = df["Fare"].median()
df[["Survived", "Sex", "Pclass"]] = df[["Survived", "Sex", "Pclass"]].astype("category")

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(11.7, 5), tight_layout=True)

palette = {0: "#e8ba91", 1: "#394d5f"}
medianprops = {"color": "#204254", "alpha": 1}
flierprops = {"marker": "x", "mec": "#204254", "mfc": "#204254"}
kw = {"medianprops": medianprops, "flierprops": flierprops, "palette": palette}

sns.boxplot(data=df, x="Age", y="Pclass", hue="Survived", ax=ax1, **kw)
sns.boxplot(data=df, x="Fare", y="Pclass", hue="Survived", ax=ax2, **kw)

# Soften the box fills. (The original also fetched each patch's facecolor
# into an unused local; that dead assignment is removed.)
for patch in np.r_[ax1.patches, ax2.patches]:
    patch.set_alpha(0.8)

ax1.legend("", frameon=False)  # hide the duplicate legend on the left plot
ax2.set_ylabel("")
ax2.set_yticks([])

adjust_box_widths(fig, 0.8, "v")
plt.show()
In [ ]:
 
In [ ]:
 
In [12]:
# Survival broken down by sex, embarkation port and class; grouping by sex
# first gives each sex its own "<Sex> Total" margins row.
df = train.copy()
df["Sex"] = df["Sex"].str.title()


def per_sex_pivot(sex):
    """Survival pivot for one sex with a sex-specific margins row."""
    pivot = sex.pivot_table(
        values="Survived",
        index=("Sex", "Embarked", "Pclass"),
        aggfunc=["mean", "sum", "count"],
        margins=True,
        margins_name=f"{sex.name} Total",
    )
    return pivot.rename(columns=names).droplevel(level=1, axis="columns")


df.groupby("Sex", group_keys=False).apply(per_sex_pivot).style.background_gradient(
    DF_CMAP
)
Out[12]:
      Mean Survival Rate Survivors Group Size
Sex Embarked Pclass      
Female C 1 0.976744 42 43
2 1.000000 7 7
3 0.652174 15 23
Q 1 1.000000 1 1
2 1.000000 2 2
3 0.727273 24 33
S 1 0.958333 46 48
2 0.910448 61 67
3 0.375000 33 88
Female Total 0.740385 231 312
Male C 1 0.404762 17 42
2 0.200000 2 10
3 0.232558 10 43
Q 1 0.000000 0 1
2 0.000000 0 1
3 0.076923 3 39
S 1 0.354430 28 79
2 0.154639 15 97
3 0.128302 34 265
Male Total 0.188908 109 577
In [ ]:
 
In [13]:
# Sunburst of all passengers: inner ring sex, outer ring survival outcome.
df = train.copy()
df["Survived"] = df["Survived"].map({1: "Survived", 0: "Died"})
df["Sex"] = df["Sex"].map({"male": "Male", "female": "Female"})

fig = px.sunburst(
    data_frame=df,
    title="Passengers Onboard",
    path=["Sex", "Survived"],
    color_discrete_sequence=["#394d5f", "#e8ba91"],
    height=640,
    width=640,
)

# Shared figure styling, applied as keyword bundles.
trace_style = dict(
    textinfo="label+percent parent",
    insidetextorientation="horizontal",
    marker_line_width=10,
    marker_line_color=BACKGROUND_COLOR,
)
layout_style = dict(
    font_color=FONT_COLOR,
    title_font_size=18,
    plot_bgcolor=BACKGROUND_COLOR,
    paper_bgcolor=BACKGROUND_COLOR,
)
fig.update_traces(**trace_style)
fig.update_layout(**layout_style)
fig.show()
In [ ]:
 
In [ ]:
 
In [14]:
df = train.copy()
df["IsAlone"] = df.eval("SibSp + Parch") == 0

df.pivot_table(
    values="Survived",
    index="IsAlone",
    aggfunc=["mean", "sum", "count"],
    margins=True,
    margins_name="Total",
).rename(columns=names,).droplevel(level=1, axis="columns").style.background_gradient(
    DF_CMAP
)
Out[14]:
  Mean Survival Rate Survivors Group Size
IsAlone      
False 0.505650 179 354
True 0.303538 163 537
Total 0.383838 342 891
In [ ]:
 
In [15]:
# NOTE(review): this cell reuses `df` (including its IsAlone column) from the
# previous cell — on a fresh kernel it must be run immediately after it.
df["Survived"] = df["Survived"].map({1: "Survived", 0: "Died"})
df["IsAlone"] = df["IsAlone"].map({True: "Alone", False: "NotAlone"})

# Sunburst: inner ring alone/not-alone, outer ring survival outcome.
fig = px.sunburst(
    data_frame=df,
    title="Alone Passengers Onboard",
    path=["IsAlone", "Survived"],
    color_discrete_sequence=["#394d5f", "#e8ba91"],
    height=640,
    width=640,
)
fig.update_traces(
    textinfo="label+percent parent",
    insidetextorientation="horizontal",
    marker_line_width=10,
    marker_line_color=BACKGROUND_COLOR,
)
fig.update_layout(
    font_color=FONT_COLOR,
    title_font_size=18,
    plot_bgcolor=BACKGROUND_COLOR,
    paper_bgcolor=BACKGROUND_COLOR,
)
fig.show()
In [ ]:
 
In [16]:
# Extract the honorific from each name ("Braund, Mr. Owen Harris" -> "Mr")
# and collapse everything outside the four common titles into "Other".
df = train.copy()
df["Title"] = df["Name"].str.extract(r" ([A-Za-z]+)\.")
common_titles = ["Mr", "Miss", "Mrs", "Master"]
rare_titles = np.setdiff1d(df["Title"], common_titles).tolist()
df["Title"] = df["Title"].replace(rare_titles, "Other")

# Death/survival rate per title alongside the size of each title group.
title_rates = pd.crosstab(
    df["Title"],
    df["Survived"],
    normalize="index",
)
titles_info = title_rates.join(df["Title"].value_counts()).rename(
    columns={
        0: "Death Rate",
        1: "Survival Rate",
        "Title": "Group Size",
    }
)

titles_info.style.background_gradient(DF_CMAP)
Out[16]:
  Death Rate Survival Rate Group Size
Title      
Master 0.425000 0.575000 40
Miss 0.302198 0.697802 182
Mr 0.843327 0.156673 517
Mrs 0.208000 0.792000 125
Other 0.555556 0.444444 27
In [17]:
# Death vs. survival rate per title, with the overall survival rate drawn as
# a dashed reference line.
melted = titles_info.reset_index().melt(
    id_vars="Title",
    value_vars=["Death Rate", "Survival Rate"],
    var_name="Rate",
    value_name="Value",
)

plt.figure(figsize=(11, 6), tight_layout=True)
rate_palette = {"Death Rate": "#e8ba91", "Survival Rate": "#394d5f"}
sns.barplot(
    data=melted, x="Title", y="Value", hue="Rate", palette=rate_palette, alpha=0.8
)
plt.axhline(df["Survived"].mean(), linewidth=2, color="#204254", linestyle="--")
plt.text(2.07, 0.41, "Mean Survival Rate", fontsize=11)
plt.legend(loc="upper left")
plt.show()
In [ ]:
 
In [ ]:
 
In [18]:
# Survival by cabin deck (first letter of the cabin code) and class; only the
# passengers with a known cabin contribute.
df = train.copy()
df["Cabin"] = df["Cabin"].str[0]

cabin_pivot = df.pivot_table(
    values="Survived",
    index=["Cabin", "Pclass"],
    aggfunc=["mean", "sum", "count"],
    margins=True,
    margins_name="Total",
)

cabin_pivot.rename(columns=names).droplevel(
    level=1, axis="columns"
).style.background_gradient(DF_CMAP)
Out[18]:
    Mean Survival Rate Survivors Group Size
Cabin Pclass      
A 1 0.466667 7 15
B 1 0.744681 35 47
C 1 0.593220 35 59
D 1 0.758621 22 29
2 0.750000 3 4
E 1 0.720000 18 25
2 0.750000 3 4
3 1.000000 3 3
F 2 0.875000 7 8
3 0.200000 1 5
G 3 0.500000 2 4
T 1 0.000000 0 1
Total 0.666667 136 204
In [ ]:
 
In [19]:
# NOTE(review): reuses `df` (with the deck letter already extracted into
# "Cabin") from the previous cell — run order matters on a fresh kernel.
plt.figure(figsize=(11, 6), tight_layout=True)
palette = {1: "#e8ba91", 2: "#394d5f", 3: "#204254"}
sns.barplot(
    data=df,
    x="Cabin",
    y="Survived",
    hue="Pclass",
    alpha=0.8,
    errorbar=None,
    palette=palette,
)
# Overall survival rate as a dashed reference line.
plt.axhline(df["Survived"].mean(), linewidth=2, color="#204254", linestyle="--")
plt.text(6.20, 0.42, "Mean Survival Rate")
plt.legend(loc="upper left", title="Pclass")
plt.show()
In [ ]:
 
In [20]:
# 3D t-SNE projection of a few preprocessed features, to check whether the
# survivors separate visually.

# Age: impute the median, then standardise.
age_pipe = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())
# Fare: impute, log-compress the heavy right tail, then standardise.
fare_pipe = make_pipeline(
    SimpleImputer(strategy="median"),
    FunctionTransformer(func=np.log1p, inverse_func=np.expm1),
    StandardScaler(),
)
# Low-cardinality categoricals: impute the mode, then ordinal-encode.
cat_pipe = make_pipeline(SimpleImputer(strategy="most_frequent"), OrdinalEncoder())

tsne_preprocess = make_column_transformer(
    (age_pipe, ["Age"]),
    (fare_pipe, ["Fare"]),
    (cat_pipe, ["Pclass", "Sex", "Embarked", "IsAlone"]),
    remainder="drop",
)

df = train.copy()
df["IsAlone"] = df.eval("SibSp + Parch == 0")
labels = train["Survived"].astype("category")

tsne = TSNE(n_components=3, random_state=42)
X_3d = tsne.fit_transform(tsne_preprocess.fit_transform(df))
X_3d = pd.DataFrame(X_3d, columns=["x1", "x2", "x3"], index=labels.index).join(labels)
X_3d.head()
Out[20]:
x1 x2 x3 Survived
PassengerId
1 -6.790534 -5.579925 -1.292992 0
2 10.453622 0.112417 5.057104 1
3 -8.081683 -0.127755 5.887538 1
4 8.922507 -1.792780 -3.436335 1
5 -3.929063 10.164669 -0.584738 0
In [ ]:
 
In [21]:
# Interactive 3D scatter of the t-SNE embedding, coloured by survival.
scatter_kw = dict(
    x="x1",
    y="x2",
    z="x3",
    symbol="Survived",
    symbol_sequence=["circle", "diamond"],
    color="Survived",
    color_discrete_sequence=["#e8ba91", "#394d5f"],
    opacity=0.5,
    height=740,
    width=740,
    title="Titanic Survivors - 3D projection with t-SNE",
)
fig = px.scatter_3d(data_frame=X_3d, **scatter_kw)

fig.update_layout(
    font_color=FONT_COLOR,
    title_font_size=18,
    plot_bgcolor=BACKGROUND_COLOR,
    paper_bgcolor=BACKGROUND_COLOR,
)
fig.update_traces(marker_size=4)
fig.show()
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [25]:
# Step 1: Import the required libraries.
# NOTE(review): this section re-imports libraries already loaded at the top
# of the notebook; it appears to be an appended, self-contained analysis.

# Core libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Machine-learning tooling.
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Plot display configuration.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)
Out[25]:
'Librairies importées avec succès !'
In [27]:
# Step 2: Load the data and take a first look.

# Load the competition files.
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
submission_data = pd.read_csv("gender_submission.csv")

# First rows and full (all-dtype) summary statistics of the training set.
train_head = train_data.head()
train_description = train_data.describe(include="all")

# DataFrame.info() prints its report and returns None, so binding its result
# to a variable (as the original did) was pointless — just call it.
train_data.info()

train_head, train_description
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
Out[27]:
(   PassengerId  Survived  Pclass  \
 0            1         0       3   
 1            2         1       1   
 2            3         1       3   
 3            4         1       1   
 4            5         0       3   
 
                                                 Name     Sex   Age  SibSp  \
 0                            Braund, Mr. Owen Harris    male  22.0      1   
 1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
 2                             Heikkinen, Miss. Laina  female  26.0      0   
 3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
 4                           Allen, Mr. William Henry    male  35.0      0   
 
    Parch            Ticket     Fare Cabin Embarked  
 0      0         A/5 21171   7.2500   NaN        S  
 1      0          PC 17599  71.2833   C85        C  
 2      0  STON/O2. 3101282   7.9250   NaN        S  
 3      0            113803  53.1000  C123        S  
 4      0            373450   8.0500   NaN        S  ,
         PassengerId    Survived      Pclass                     Name   Sex  \
 count    891.000000  891.000000  891.000000                      891   891   
 unique          NaN         NaN         NaN                      891     2   
 top             NaN         NaN         NaN  Braund, Mr. Owen Harris  male   
 freq            NaN         NaN         NaN                        1   577   
 mean     446.000000    0.383838    2.308642                      NaN   NaN   
 std      257.353842    0.486592    0.836071                      NaN   NaN   
 min        1.000000    0.000000    1.000000                      NaN   NaN   
 25%      223.500000    0.000000    2.000000                      NaN   NaN   
 50%      446.000000    0.000000    3.000000                      NaN   NaN   
 75%      668.500000    1.000000    3.000000                      NaN   NaN   
 max      891.000000    1.000000    3.000000                      NaN   NaN   
 
                Age       SibSp       Parch  Ticket        Fare    Cabin  \
 count   714.000000  891.000000  891.000000     891  891.000000      204   
 unique         NaN         NaN         NaN     681         NaN      147   
 top            NaN         NaN         NaN  347082         NaN  B96 B98   
 freq           NaN         NaN         NaN       7         NaN        4   
 mean     29.699118    0.523008    0.381594     NaN   32.204208      NaN   
 std      14.526497    1.102743    0.806057     NaN   49.693429      NaN   
 min       0.420000    0.000000    0.000000     NaN    0.000000      NaN   
 25%      20.125000    0.000000    0.000000     NaN    7.910400      NaN   
 50%      28.000000    0.000000    0.000000     NaN   14.454200      NaN   
 75%      38.000000    1.000000    0.000000     NaN   31.000000      NaN   
 max      80.000000    8.000000    6.000000     NaN  512.329200      NaN   
 
        Embarked  
 count       889  
 unique        3  
 top           S  
 freq        644  
 mean        NaN  
 std         NaN  
 min         NaN  
 25%         NaN  
 50%         NaN  
 75%         NaN  
 max         NaN  )
In [28]:
# Step 3: Exploratory Data Analysis (EDA).
# The original cell repeated the same countplot boilerplate seven times;
# the shared pattern is factored into a helper. All user-facing French
# labels are preserved exactly.


def _countplot(x, title, xlabel, hue=None, legend_title=None, figsize=(6, 5)):
    """Draw one countplot of ``train_data`` with the given French labels."""
    plt.figure(figsize=figsize)
    sns.countplot(x=x, hue=hue, data=train_data)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Nombre de passagers')
    if legend_title is not None:
        plt.legend(title=legend_title)
    plt.show()


# Survivor counts: overall, by sex, and by class.
_countplot('Survived', 'Distribution des survivants', 'Survivants')
_countplot(
    'Survived', 'Distribution des survivants par sexe', 'Survivants',
    hue='Sex', legend_title='Sexe',
)
_countplot(
    'Survived', 'Distribution des survivants par classe', 'Survivants',
    hue='Pclass', legend_title='Classe',
)

# Age distribution (known ages only).
plt.figure(figsize=(10, 6))
sns.histplot(train_data['Age'].dropna(), kde=True, bins=30)
plt.title('Distribution des âges')
plt.xlabel('Âge')
plt.ylabel('Nombre de passagers')
plt.show()

# Fare distribution.
plt.figure(figsize=(10, 6))
sns.histplot(train_data['Fare'], kde=True, bins=40)
plt.title('Distribution des tarifs')
plt.xlabel('Tarif')
plt.ylabel('Nombre de passagers')
plt.show()

# Siblings/spouses and parents/children counts.
_countplot(
    'SibSp', 'Distribution du nombre de frères et sœurs/conjoints',
    'Nombre de frères et sœurs/conjoints', figsize=(8, 6),
)
_countplot(
    'Parch', 'Distribution du nombre de parents/enfants',
    'Nombre de parents/enfants', figsize=(8, 6),
)
In [29]:
# Step 4: Preprocessing and feature engineering.

# --- Missing values ---
# Assign the filled column back instead of calling `inplace=True` on a column
# slice: the latter is chained assignment, which emits FutureWarning on
# pandas 2.x and stops working under copy-on-write in pandas 3.0.
age_median = train_data['Age'].median()
train_data['Age'] = train_data['Age'].fillna(age_median)

# Fill the two missing "Embarked" values with the most frequent port.
embarked_mode = train_data['Embarked'].mode()[0]
train_data['Embarked'] = train_data['Embarked'].fillna(embarked_mode)

# "Cabin" is missing for ~77% of passengers, so drop it entirely.
train_data = train_data.drop('Cabin', axis=1)

# --- Feature engineering ---
# Extract the honorific from the name ("Braund, Mr. Owen Harris" -> "Mr").
train_data['Title'] = train_data['Name'].apply(
    lambda x: x.split(',')[1].split('.')[0].strip()
)

# Collapse rare titles into "Other" and normalise abbreviated/French variants.
rare_titles = ['Dr', 'Rev', 'Col', 'Major', 'Lady', 'Jonkheer', 'Don', 'Capt', 'Sir', 'Countess']
train_data['Title'] = (
    train_data['Title']
    .replace(rare_titles, 'Other')
    .replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
)

# Family size = siblings/spouses + parents/children + the passenger.
train_data['FamilySize'] = train_data['SibSp'] + train_data['Parch'] + 1

# Drop identifier-like columns that carry no predictive signal as-is.
train_data = train_data.drop(['Name', 'Ticket', 'PassengerId'], axis=1)

# --- Encode categoricals ---
# Keep the fitted encoders so the test set can be transformed consistently.
label_encoders = {}
categorical_cols = ['Sex', 'Embarked', 'Title']

for col in categorical_cols:
    le = LabelEncoder()
    train_data[col] = le.fit_transform(train_data[col])
    label_encoders[col] = le

# Sanity check of the processed frame.
train_data_head = train_data.head()

train_data_head
Out[29]:
Survived Pclass Sex Age SibSp Parch Fare Embarked Title FamilySize
0 0 3 1 22.0 1 0 7.2500 2 2 2
1 1 1 0 38.0 1 0 71.2833 0 3 2
2 1 3 0 26.0 0 0 7.9250 2 1 1
3 1 1 0 35.0 1 0 53.1000 2 3 2
4 0 3 1 35.0 0 0 8.0500 2 2 1
In [30]:
# Step 5: Hold out 20% of the training data for validation.

feature_matrix = train_data.drop('Survived', axis=1)
target = train_data['Survived']

X_train, X_val, y_train, y_val = train_test_split(
    feature_matrix, target, test_size=0.2, random_state=42
)

# Shapes of the resulting splits.
X_train.shape, X_val.shape, y_train.shape, y_val.shape
Out[30]:
((712, 9), (179, 9), (712,), (179,))
In [31]:
# Steps 6 & 7: choose and fit a random-forest classifier on the training
# split; the fitted estimator's repr is the cell output.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
Out[31]:
RandomForestClassifier(random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(random_state=42)
In [32]:
# Step 8: Evaluate the model on the validation split.

y_pred = clf.predict(X_val)

accuracy = accuracy_score(y_val, y_pred)       # overall hit rate
report = classification_report(y_val, y_pred)  # per-class precision/recall/F1
confusion = confusion_matrix(y_val, y_pred)    # raw error counts

accuracy, report, confusion
Out[32]:
(0.8324022346368715,
 '              precision    recall  f1-score   support\n\n           0       0.85      0.87      0.86       105\n           1       0.81      0.78      0.79        74\n\n    accuracy                           0.83       179\n   macro avg       0.83      0.83      0.83       179\nweighted avg       0.83      0.83      0.83       179\n',
 array([[91, 14],
        [16, 58]], dtype=int64))
In [33]:
# Step 9: Visualise the results (labels kept in French to match the section).

# Confusion matrix as an annotated heatmap.
plt.figure(figsize=(8, 6))
sns.heatmap(confusion, annot=True, fmt='g', cmap='Blues', cbar=False)
plt.xlabel('Prédictions')
plt.ylabel('Valeurs réelles')
plt.title('Matrice de confusion')
plt.xticks([0.5, 1.5], ['Non survécu', 'Survécu'])
plt.yticks([0.5, 1.5], ['Non survécu', 'Survécu'], rotation=0)
plt.show()

# Feature importances, least to most important from bottom to top.
importances = clf.feature_importances_
feature_names = X_train.columns
order = np.argsort(importances)

plt.figure(figsize=(10, 8))
plt.barh(range(len(order)), importances[order], align='center')
plt.yticks(range(len(order)), [feature_names[i] for i in order])
plt.xlabel('Importance')
plt.title('Importance des caractéristiques')
plt.show()
In [ ]: